whether a particular booking made by a user is going to be canceled or not?
- Where do guests come from?
- How much do guests pay for a night?
- How does the price per night vary over the year?
- Which are the most busy months or in which months Guests are high?
- How long do people stay at the hotels?
# Silence warning output so the notebook results stay readable.
import warnings

warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw hotel-bookings dataset and take a first look at its size and rows.
df0 = pd.read_csv("hotel_bookings.csv")
print(df0.shape)
df0.head()
(119390, 32)
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 7/3/2015 |
5 rows × 32 columns
# List all 32 column names of the raw dataframe.
df0.columns
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
'country', 'market_segment', 'distribution_channel',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
'company', 'days_in_waiting_list', 'customer_type', 'adr',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
# Count missing values per column (children: 4, country: 488, agent: 16340, company: 112593).
df0.isna().sum()
hotel 0 is_canceled 0 lead_time 0 arrival_date_year 0 arrival_date_month 0 arrival_date_week_number 0 arrival_date_day_of_month 0 stays_in_weekend_nights 0 stays_in_week_nights 0 adults 0 children 4 babies 0 meal 0 country 488 market_segment 0 distribution_channel 0 is_repeated_guest 0 previous_cancellations 0 previous_bookings_not_canceled 0 reserved_room_type 0 assigned_room_type 0 booking_changes 0 deposit_type 0 agent 16340 company 112593 days_in_waiting_list 0 customer_type 0 adr 0 required_car_parking_spaces 0 total_of_special_requests 0 reservation_status 0 reservation_status_date 0 dtype: int64
# Taking care of missing values with type-appropriate fills instead of a
# blanket fillna(0), which would place the integer 0 into the string-typed
# "country" column:
df0["children"] = df0["children"].fillna(0)        # 4 NaN: no children recorded
df0["agent"] = df0["agent"].fillna(0)              # 16340 NaN: booked without an agent
df0["company"] = df0["company"].fillna(0)          # 112593 NaN: no company involved
df0["country"] = df0["country"].fillna("unknown")  # 488 NaN: origin not recorded
# Converting reservation_status_date (e.g. "7/1/2015") to datetime so the
# .dt accessor can be used later.
df0["reservation_status_date"] = pd.to_datetime(df0["reservation_status_date"])
# Sanity-check the guest-count columns by listing their unique values:
temp_list = ['adults', 'children', 'babies']
for i in temp_list:
    print(f"{i} has unique values as: \t {df0[i].unique()}")
adults has unique values as: [ 2 1 3 4 40 26 50 27 55 0 20 6 5 10] children has unique values as: [ 0. 1. 2. 10. 3.] babies has unique values as: [ 0 1 2 10 9]
# Mask for bookings with no guests at all: adults, children and babies all zero.
# NOTE(review): the name "filter" shadows the built-in; kept because a later
# cell references it.
filter = (df0[["children", "adults", "babies"]] == 0).all(axis=1)
df0[filter].shape
(180, 32)
There are 180 rows in which adults, children, and babies are all zero at the same time; these need to be excluded from the dataset.
# Keep only bookings with at least one guest ("~" negates the zero-guest mask).
df = df0[~filter] #excluding the filtered data by using "~" sign
# Spatial analysis: home countries of guests who actually arrived
# (is_canceled == 0), one row per country with its guest count.
country_wise_data = (
    df.loc[df["is_canceled"] == 0, "country"]
    .value_counts()
    .rename_axis("country")
    .reset_index(name="no. of guests")
)
country_wise_data
| country | no. of guests | |
|---|---|---|
| 0 | PRT | 20977 |
| 1 | GBR | 9668 |
| 2 | FRA | 8468 |
| 3 | ESP | 6383 |
| 4 | DEU | 6067 |
| ... | ... | ... |
| 161 | SMR | 1 |
| 162 | GUY | 1 |
| 163 | NCL | 1 |
| 164 | NPL | 1 |
| 165 | MLI | 1 |
166 rows × 2 columns
# !pip install folium
# !pip install plotly
import folium
from folium.plugins import HeatMap
import plotly.express as px

basemap = folium.Map()  # NOTE(review): unused in the visible analysis — candidate for removal
# Choropleth of guests' home countries. Passing the dataframe once and
# referring to its columns by name is cleaner than passing one Series per
# argument and produces the same figure.
map_guest = px.choropleth(country_wise_data,
                          locations="country",
                          color="no. of guests",
                          hover_name="country",
                          title="Home country of guests")
map_guest.show()
## we set a new dataset where "is_canceled" is 0, in order to assess the features:
# .copy() so that columns added to data_0 later do not raise
# SettingWithCopyWarning on a view of df.
data_0 = df[df["is_canceled"] == 0].copy()

# Answering the second question: How much do guests pay for a night?
plt.figure(figsize=(12, 8))
sns.boxplot(x="reserved_room_type", y="adr", hue="hotel", data=data_0)
plt.title("Price of room types per night & per person")
# BUG FIX: the original did `plt.xlabel = ("Room Type")`, which REBINDS the
# pyplot function to a string instead of labelling the axis (the labels never
# appeared). Call the functions instead.
plt.xlabel("Room Type")
plt.ylabel("Price(Euro)")
plt.legend()
plt.show();
# Prices per night, examined separately for the two hotel types.
# Only completed stays (is_canceled == 0) are kept.
completed = df["is_canceled"] == 0
resort_hotel_data = df[(df["hotel"] == "Resort Hotel") & completed]
city_hotel_data = df[(df["hotel"] == "City Hotel") & completed]
resort_hotel_data.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
5 rows × 32 columns
# Preview the city-hotel subset.
city_hotel_data.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 40060 | City Hotel | 0 | 6 | 2015 | July | 27 | 1 | 0 | 2 | 1 | ... | No Deposit | 6.0 | 0.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 2015-07-03 |
| 40066 | City Hotel | 0 | 3 | 2015 | July | 27 | 2 | 0 | 3 | 1 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 58.67 | 0 | 0 | Check-Out | 2015-07-05 |
| 40070 | City Hotel | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 2015-07-05 |
| 40071 | City Hotel | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 43.00 | 0 | 0 | Check-Out | 2015-07-05 |
| 40072 | City Hotel | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 2015-07-05 |
5 rows × 32 columns
# Mean daily rate (adr) per arrival month for each hotel type.
resort_hotel_price = resort_hotel_data.groupby(["arrival_date_month"])["adr"].mean().reset_index()
city_hotel_price = city_hotel_data.groupby(["arrival_date_month"])["adr"].mean().reset_index()
# Put the two monthly series side by side and give readable column names.
final_data = pd.merge(resort_hotel_price, city_hotel_price, on="arrival_date_month")
final_data.columns = ["month", "price_for_resort", "price_for_city_hotel"]
resort_hotel_price
| arrival_date_month | adr | |
|---|---|---|
| 0 | April | 75.867816 |
| 1 | August | 181.205892 |
| 2 | December | 68.410104 |
| 3 | February | 54.147478 |
| 4 | January | 48.761125 |
| 5 | July | 150.122528 |
| 6 | June | 107.974850 |
| 7 | March | 57.056838 |
| 8 | May | 76.657558 |
| 9 | November | 48.706289 |
| 10 | October | 61.775449 |
| 11 | September | 96.416860 |
# Monthly mean price for the city hotel.
city_hotel_price
| arrival_date_month | adr | |
|---|---|---|
| 0 | April | 111.962267 |
| 1 | August | 118.674598 |
| 2 | December | 88.401855 |
| 3 | February | 86.520062 |
| 4 | January | 82.330983 |
| 5 | July | 115.818019 |
| 6 | June | 117.874360 |
| 7 | March | 90.658533 |
| 8 | May | 120.669827 |
| 9 | November | 86.946592 |
| 10 | October | 102.004672 |
| 11 | September | 112.776582 |
# The merged table is ordered alphabetically by month name, not calendar order.
final_data.head(12)
# notice that the table is not sorted (by month):
| month | price_for_resort | price_for_city_hotel | |
|---|---|---|---|
| 0 | April | 75.867816 | 111.962267 |
| 1 | August | 181.205892 | 118.674598 |
| 2 | December | 68.410104 | 88.401855 |
| 3 | February | 54.147478 | 86.520062 |
| 4 | January | 48.761125 | 82.330983 |
| 5 | July | 150.122528 | 115.818019 |
| 6 | June | 107.974850 | 117.874360 |
| 7 | March | 57.056838 | 90.658533 |
| 8 | May | 76.657558 | 120.669827 |
| 9 | November | 48.706289 | 86.946592 |
| 10 | October | 61.775449 | 102.004672 |
| 11 | September | 96.416860 | 112.776582 |
# Sorting "final_data" into calendar month order using helper packages:
# !pip install sorted-months-weekdays
# !pip install sort-dataframeby-monthorweek
# NOTE(review): pd.Categorical with an ordered month list would achieve the
# same result without these extra dependencies.
import sort_dataframeby_monthorweek as sd
# Sort the rows January..December (the helper also resets the index):
final_data = sd.Sort_Dataframeby_Month(final_data, "month")
final_data.head(12)
| month | price_for_resort | price_for_city_hotel | |
|---|---|---|---|
| 0 | January | 48.761125 | 82.330983 |
| 1 | February | 54.147478 | 86.520062 |
| 2 | March | 57.056838 | 90.658533 |
| 3 | April | 75.867816 | 111.962267 |
| 4 | May | 76.657558 | 120.669827 |
| 5 | June | 107.974850 | 117.874360 |
| 6 | July | 150.122528 | 115.818019 |
| 7 | August | 181.205892 | 118.674598 |
| 8 | September | 96.416860 | 112.776582 |
| 9 | October | 61.775449 | 102.004672 |
| 10 | November | 48.706289 | 86.946592 |
| 11 | December | 68.410104 | 88.401855 |
# Line chart comparing the monthly room price of the two hotel types.
px.line(
    data_frame=final_data,
    x="month",
    y=["price_for_resort", "price_for_city_hotel"],
    title="Room price per night over the months",
)
# Which months are the busiest?
# Count arrivals per month for each hotel type; rename_axis/reset_index(name=...)
# produces the readable "month" / "no. of guests" columns directly.
resort_hotel_arrival = (
    resort_hotel_data["arrival_date_month"]
    .value_counts()
    .rename_axis("month")
    .reset_index(name="no. of guests")
)
city_hotel_arrival = (
    city_hotel_data["arrival_date_month"]
    .value_counts()
    .rename_axis("month")
    .reset_index(name="no. of guests")
)
# Resort-hotel arrivals per month, most popular first.
resort_hotel_arrival
| month | no. of guests | |
|---|---|---|
| 0 | August | 3257 |
| 1 | July | 3137 |
| 2 | October | 2575 |
| 3 | March | 2571 |
| 4 | April | 2550 |
| 5 | May | 2535 |
| 6 | February | 2308 |
| 7 | September | 2102 |
| 8 | June | 2037 |
| 9 | December | 2014 |
| 10 | November | 1975 |
| 11 | January | 1866 |
# City-hotel arrivals per month, most popular first.
city_hotel_arrival
| month | no. of guests | |
|---|---|---|
| 0 | August | 5367 |
| 1 | July | 4770 |
| 2 | May | 4568 |
| 3 | June | 4358 |
| 4 | October | 4326 |
| 5 | September | 4283 |
| 6 | March | 4049 |
| 7 | April | 4010 |
| 8 | February | 3051 |
| 9 | November | 2676 |
| 10 | December | 2377 |
| 11 | January | 2249 |
# Merging the two monthly guest counts into a single table.
final_arrival = resort_hotel_arrival.merge(city_hotel_arrival, on="month")
final_arrival.columns = ["month", 'no. of guests in resort', 'no. of guests in city hotel']
# Sort into calendar order with plain pandas instead of the third-party
# sort_dataframeby_monthorweek helper (one dependency fewer; same row order).
month_order = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]
month_rank = {m: i for i, m in enumerate(month_order)}
final_arrival = (
    final_arrival.sort_values("month", key=lambda s: s.map(month_rank))
    .reset_index(drop=True)
)
final_arrival
| month | no. of guests in resort | no. of guests in city hotel | |
|---|---|---|---|
| 0 | January | 1866 | 2249 |
| 1 | February | 2308 | 3051 |
| 2 | March | 2571 | 4049 |
| 3 | April | 2550 | 4010 |
| 4 | May | 2535 | 4568 |
| 5 | June | 2037 | 4358 |
| 6 | July | 3137 | 4770 |
| 7 | August | 3257 | 5367 |
| 8 | September | 2102 | 4283 |
| 9 | October | 2575 | 4326 |
| 10 | November | 1975 | 2676 |
| 11 | December | 2014 | 2377 |
# Line chart of monthly guest totals per hotel type.
px.line(data_frame = final_arrival,
x = "month",
y = ["no. of guests in resort", "no. of guests in city hotel"],
title='Total no of guests per Months')
The City hotel has more guests during spring and autumn, when the prices are also highest. In July and August there are fewer visitors, although prices are lower.
Guest numbers for the Resort hotel go down slightly from June to September, which is also when the prices are highest. Both hotels have the fewest guests during the winter.
# How long do people stay at the hotels?
# Combine week-night and weekend-night counts into one "total_nights" column.
# data_0 holds the non-canceled bookings (is_canceled == 0).
# .assign() returns a fresh frame, avoiding pandas' SettingWithCopyWarning
# that direct column assignment on a slice of df would trigger.
data_0 = data_0.assign(
    total_nights=data_0["stays_in_week_nights"] + data_0["stays_in_weekend_nights"]
)
data_0.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | total_nights | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 | 0 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 | 0 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 | 1 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 | 1 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 | 2 |
5 rows × 33 columns
# Confirm the new "total_nights" column is present (33 columns now).
data_0.columns
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
'country', 'market_segment', 'distribution_channel',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
'company', 'days_in_waiting_list', 'customer_type', 'adr',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date', 'total_nights'],
dtype='object')
# Number of bookings per (total_nights, hotel) combination.
# .size() counts rows directly, replacing the original count-every-column /
# slice / rename-"is_canceled" sequence; the values are identical because
# is_canceled contains no missing entries in data_0.
stay_data = (
    data_0.groupby(["total_nights", "hotel"])
    .size()
    .reset_index(name="total_numbers_of_stay")
)
stay_data.head()
| total_nights | hotel | total_numbers_of_stay | |
|---|---|---|---|
| 0 | 0 | City Hotel | 251 |
| 1 | 0 | Resort Hotel | 371 |
| 2 | 1 | City Hotel | 9155 |
| 3 | 1 | Resort Hotel | 6579 |
| 4 | 2 | City Hotel | 10983 |
# Bar chart: number of stays by total nights, split by hotel type.
plt.figure(figsize=(21,10))
sns.barplot(x = 'total_nights', y = 'total_numbers_of_stay', hue = 'hotel',
hue_order = ['City Hotel', 'Resort Hotel'], data = stay_data)
plt.legend(loc ='right');
For correlations we use the cleaned dataset, where missing values and other abnormalities were taken care of:
# Pairwise correlations of the numeric columns.
# NOTE(review): newer pandas (>= 2.0) requires corr(numeric_only=True) when
# non-numeric columns are present — confirm the pandas version in use.
df.corr()
| is_canceled | lead_time | arrival_date_year | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | booking_changes | agent | company | days_in_waiting_list | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| is_canceled | 1.000000 | 0.292876 | 0.016622 | 0.008315 | -0.005948 | -0.001323 | 0.025542 | 0.058182 | 0.004851 | -0.032569 | -0.083745 | 0.110139 | -0.057365 | -0.144832 | -0.046770 | -0.083594 | 0.054301 | 0.046492 | -0.195701 | -0.234877 |
| lead_time | 0.292876 | 1.000000 | 0.040334 | 0.127046 | 0.002306 | 0.085985 | 0.166892 | 0.117575 | -0.037878 | -0.021003 | -0.123209 | 0.086025 | -0.073599 | 0.002230 | -0.013114 | -0.085854 | 0.170008 | -0.065018 | -0.116624 | -0.095949 |
| arrival_date_year | 0.016622 | 0.040334 | 1.000000 | -0.540373 | -0.000121 | 0.021694 | 0.031203 | 0.030266 | 0.054710 | -0.013192 | 0.010281 | -0.119905 | 0.029234 | 0.031416 | 0.056438 | 0.033682 | -0.056348 | 0.198429 | -0.013812 | 0.108610 |
| arrival_date_week_number | 0.008315 | 0.127046 | -0.540373 | 1.000000 | 0.066572 | 0.018629 | 0.016047 | 0.026567 | 0.005556 | 0.010417 | -0.031125 | 0.035493 | -0.021009 | 0.006311 | -0.018225 | -0.032912 | 0.022677 | 0.076281 | 0.001980 | 0.026202 |
| arrival_date_day_of_month | -0.005948 | 0.002306 | -0.000121 | 0.066572 | 1.000000 | -0.016225 | -0.028362 | -0.001754 | 0.014550 | -0.000235 | -0.006471 | -0.027027 | -0.000306 | 0.011266 | 0.000159 | 0.003667 | 0.022532 | 0.030291 | 0.008569 | 0.003026 |
| stays_in_weekend_nights | -0.001323 | 0.085985 | 0.021694 | 0.018629 | -0.016225 | 1.000000 | 0.494175 | 0.094759 | 0.046135 | 0.018607 | -0.086009 | -0.012769 | -0.042859 | 0.050191 | 0.162411 | -0.080783 | -0.054399 | 0.050670 | -0.018520 | 0.073124 |
| stays_in_week_nights | 0.025542 | 0.166892 | 0.031203 | 0.016047 | -0.028362 | 0.494175 | 1.000000 | 0.096214 | 0.044652 | 0.020373 | -0.095302 | -0.013976 | -0.048873 | 0.080018 | 0.196777 | -0.044437 | -0.002026 | 0.066847 | -0.024933 | 0.068738 |
| adults | 0.058182 | 0.117575 | 0.030266 | 0.026567 | -0.001754 | 0.094759 | 0.096214 | 1.000000 | 0.029409 | 0.017890 | -0.140973 | -0.007070 | -0.108856 | -0.041472 | 0.023370 | -0.166182 | -0.008365 | 0.224253 | 0.014438 | 0.123353 |
| children | 0.004851 | -0.037878 | 0.054710 | 0.005556 | 0.014550 | 0.046135 | 0.044652 | 0.029409 | 1.000000 | 0.023999 | -0.032475 | -0.024755 | -0.021078 | 0.051000 | 0.050461 | -0.042554 | -0.033293 | 0.325057 | 0.056247 | 0.081747 |
| babies | -0.032569 | -0.021003 | -0.013192 | 0.010417 | -0.000235 | 0.018607 | 0.020373 | 0.017890 | 0.023999 | 1.000000 | -0.008813 | -0.007509 | -0.006552 | 0.085605 | 0.030235 | -0.009426 | -0.010627 | 0.029043 | 0.037389 | 0.097939 |
| is_repeated_guest | -0.083745 | -0.123209 | 0.010281 | -0.031125 | -0.006471 | -0.086009 | -0.095302 | -0.140973 | -0.032475 | -0.008813 | 1.000000 | 0.082740 | 0.420642 | 0.013044 | -0.051584 | 0.161871 | -0.022057 | -0.130807 | 0.077928 | 0.012963 |
| previous_cancellations | 0.110139 | 0.086025 | -0.119905 | 0.035493 | -0.027027 | -0.012769 | -0.013976 | -0.007070 | -0.024755 | -0.007509 | 0.082740 | 1.000000 | 0.152570 | -0.027261 | -0.018251 | -0.001110 | 0.005941 | -0.065974 | -0.018540 | -0.048488 |
| previous_bookings_not_canceled | -0.057365 | -0.073599 | 0.029234 | -0.021009 | -0.000306 | -0.042859 | -0.048873 | -0.108856 | -0.021078 | -0.006552 | 0.420642 | 0.152570 | 1.000000 | 0.011963 | -0.046348 | 0.111220 | -0.009416 | -0.072335 | 0.047506 | 0.037775 |
| booking_changes | -0.144832 | 0.002230 | 0.031416 | 0.006311 | 0.011266 | 0.050191 | 0.080018 | -0.041472 | 0.051000 | 0.085605 | 0.013044 | -0.027261 | 0.011963 | 1.000000 | 0.038555 | 0.089768 | -0.011916 | 0.026601 | 0.067490 | 0.055003 |
| agent | -0.046770 | -0.013114 | 0.056438 | -0.018225 | 0.000159 | 0.162411 | 0.196777 | 0.023370 | 0.050461 | 0.030235 | -0.051584 | -0.018251 | -0.046348 | 0.038555 | 1.000000 | -0.121333 | -0.041182 | 0.015711 | 0.119282 | 0.060783 |
| company | -0.083594 | -0.085854 | 0.033682 | -0.032912 | 0.003667 | -0.080783 | -0.044437 | -0.166182 | -0.042554 | -0.009426 | 0.161871 | -0.001110 | 0.111220 | 0.089768 | -0.121333 | 1.000000 | -0.022944 | -0.127641 | 0.038638 | -0.090790 |
| days_in_waiting_list | 0.054301 | 0.170008 | -0.056348 | 0.022677 | 0.022532 | -0.054399 | -0.002026 | -0.008365 | -0.033293 | -0.010627 | -0.022057 | 0.005941 | -0.009416 | -0.011916 | -0.041182 | -0.022944 | 1.000000 | -0.040859 | -0.030601 | -0.082755 |
| adr | 0.046492 | -0.065018 | 0.198429 | 0.076281 | 0.030291 | 0.050670 | 0.066847 | 0.224253 | 0.325057 | 0.029043 | -0.130807 | -0.065974 | -0.072335 | 0.026601 | 0.015711 | -0.127641 | -0.040859 | 1.000000 | 0.056510 | 0.172308 |
| required_car_parking_spaces | -0.195701 | -0.116624 | -0.013812 | 0.001980 | 0.008569 | -0.018520 | -0.024933 | 0.014438 | 0.056247 | 0.037389 | 0.077928 | -0.018540 | 0.047506 | 0.067490 | 0.119282 | 0.038638 | -0.030601 | 0.056510 | 1.000000 | 0.082718 |
| total_of_special_requests | -0.234877 | -0.095949 | 0.108610 | 0.026202 | 0.003026 | 0.073124 | 0.068738 | 0.123353 | 0.081747 | 0.097939 | 0.012963 | -0.048488 | 0.037775 | 0.055003 | 0.060783 | -0.090790 | -0.082755 | 0.172308 | 0.082718 | 1.000000 |
# We only need the correlation of "is_canceled" with the other attributes;
# abs() ranks by strength regardless of sign, and [1:] drops the trivial
# self-correlation (always 1.0).
# Fixes the "corelation" misspelling of the original variable name (the name
# is only used in this cell).
correlation = df.corr()['is_canceled'].abs().sort_values(ascending = False)[1:]
correlation
lead_time 0.292876 total_of_special_requests 0.234877 required_car_parking_spaces 0.195701 booking_changes 0.144832 previous_cancellations 0.110139 is_repeated_guest 0.083745 company 0.083594 adults 0.058182 previous_bookings_not_canceled 0.057365 days_in_waiting_list 0.054301 agent 0.046770 adr 0.046492 babies 0.032569 stays_in_week_nights 0.025542 arrival_date_year 0.016622 arrival_date_week_number 0.008315 arrival_date_day_of_month 0.005948 children 0.004851 stays_in_weekend_nights 0.001323 Name: is_canceled, dtype: float64
From this list it is apparent that lead_time, total_of_special_requests, required_car_parking_spaces, booking_changes and previous_cancellations are the 5 most important numerical features. However, to predict whether or not a booking will be canceled, the number of booking changes is a possible source of leakage, because this information can change over time. I will also not include "days_in_waiting_list" and "arrival_date_year".
The most important feature to exclude is the "reservation_status":
# reservation_status perfectly encodes the target (Check-Out vs Canceled/No-Show),
# so it must be excluded from any predictive feature set.
df.groupby("is_canceled")["reservation_status"].value_counts()
is_canceled reservation_status
0 Check-Out 75011
1 Canceled 42993
No-Show 1206
Name: reservation_status, dtype: int64
# Numerical features that are better removed: either they have tiny impact on
# the model (low correlation) or they add noise/leakage.
# Fixes the "exlusion_list" misspelling of the original name (only used here).
exclusion_list = ["days_in_waiting_list", "arrival_date_year"]
# Keep every non-object (numeric/datetime) column that is not excluded:
numerical_features = [col for col in df.columns if df[col].dtype != "O" and col not in exclusion_list]
# The last non-object column is reservation_status_date (datetime64);
# drop it here — the date is handled separately as a categorical feature.
numerical_features = numerical_features[:-1]
numerical_data = df[numerical_features]
# Categorical columns excluded for low impact or leakage risk:
exclusion_list_cat = [
    "arrival_date_year",
    "assigned_room_type",
    "booking_changes",
    "reservation_status",
    "country",
    "days_in_waiting_list",
]
# Every object-typed column that survives the exclusion list:
categorical_features = [
    col
    for col in df.columns
    if df[col].dtype == 'O' and col not in exclusion_list_cat
]
# The reservation date is treated as a categorical feature as well.
categorical_features.append("reservation_status_date")
categorical_features
# Creating a new dataset for the categorical features.
# .copy() so the date-part columns added below do not raise
# SettingWithCopyWarning on a view of df.
data_cat = df[categorical_features].copy()
# Extracting year/month/day from the reservation date:
data_cat["res_year"] = data_cat["reservation_status_date"].dt.year
data_cat["res_month"] = data_cat["reservation_status_date"].dt.month
data_cat["res_day"] = data_cat["reservation_status_date"].dt.day
# The raw datetime column is no longer needed:
data_cat.drop("reservation_status_date", axis = 1, inplace = True)
# Attach the target so categories can be encoded against it:
data_cat ["cancellation"] = df["is_canceled"]
data_cat.head()
| hotel | arrival_date_month | meal | market_segment | distribution_channel | reserved_room_type | deposit_type | customer_type | res_year | res_month | res_day | cancellation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | July | BB | Direct | Direct | C | No Deposit | Transient | 2015 | 7 | 1 | 0 |
| 1 | Resort Hotel | July | BB | Direct | Direct | C | No Deposit | Transient | 2015 | 7 | 1 | 0 |
| 2 | Resort Hotel | July | BB | Direct | Direct | A | No Deposit | Transient | 2015 | 7 | 2 | 0 |
| 3 | Resort Hotel | July | BB | Corporate | Corporate | A | No Deposit | Transient | 2015 | 7 | 2 | 0 |
| 4 | Resort Hotel | July | BB | Online TA | TA/TO | A | No Deposit | Transient | 2015 | 7 | 3 | 0 |
In this stage, we use the "Mean Encoding" technique to take care of the categorical variables. By mean encoding we convert the categorical variables to numbers (means) with respect to the "cancellation" column. It is basically grouping a column with respect to another column and then calculating the mean:
# Mean-encode all columns except "cancellation" (columns [0:8]).
inc_cols = data_cat.columns[0:8]
for col in inc_cols:
    # Mean cancellation rate per category level; renamed from "dict", which
    # shadowed the built-in type.
    # NOTE(review): encoding against the full dataset leaks the target into
    # the features — ideally fit this on the training split only.
    level_means = data_cat.groupby(col)["cancellation"].mean()
    data_cat[col] = data_cat[col].map(level_means)  # replace categories with their means
data_cat.head()
| hotel | arrival_date_month | meal | market_segment | distribution_channel | reserved_room_type | deposit_type | customer_type | res_year | res_month | res_day | cancellation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.277674 | 0.374644 | 0.374106 | 0.153712 | 0.174868 | 0.330827 | 0.28402 | 0.407864 | 2015 | 7 | 1 | 0 |
| 1 | 0.277674 | 0.374644 | 0.374106 | 0.153712 | 0.174868 | 0.330827 | 0.28402 | 0.407864 | 2015 | 7 | 1 | 0 |
| 2 | 0.277674 | 0.374644 | 0.374106 | 0.153712 | 0.174868 | 0.391567 | 0.28402 | 0.407864 | 2015 | 7 | 2 | 0 |
| 3 | 0.277674 | 0.374644 | 0.374106 | 0.187618 | 0.220568 | 0.391567 | 0.28402 | 0.407864 | 2015 | 7 | 2 | 0 |
| 4 | 0.277674 | 0.374644 | 0.374106 | 0.367590 | 0.410598 | 0.391567 | 0.28402 | 0.407864 | 2015 | 7 | 3 | 0 |
# Stitch the encoded categorical frame and the numerical frame together.
# (Reuses the name "final_data" from the earlier price table.)
final_data = pd.concat([data_cat, numerical_data], axis=1)
# "is_canceled" duplicates the "cancellation" target column, so drop it.
final_data = final_data.drop(columns="is_canceled")
print(final_data.columns)
print(final_data.shape)
Index(['hotel', 'arrival_date_month', 'meal', 'market_segment',
'distribution_channel', 'reserved_room_type', 'deposit_type',
'customer_type', 'res_year', 'res_month', 'res_day', 'cancellation',
'lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month',
'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
'babies', 'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company',
'adr', 'required_car_parking_spaces', 'total_of_special_requests'],
dtype='object')
(119210, 29)
# Drop any rows still containing missing values before modelling.
final_data.dropna(inplace=True)
# based on the nature of our data, it is easy to guess in which columns we might have outliers!
# we need to check "lead_time" and "adr"
# Distribution of lead_time — long right tail suggests outliers.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; use histplot/displot.
sns.distplot(final_data["lead_time"])
<AxesSubplot:xlabel='lead_time', ylabel='Density'>
# Distribution of adr (average daily rate) — also heavy-tailed.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; use histplot/displot.
sns.distplot(final_data["adr"])
<AxesSubplot:xlabel='adr', ylabel='Density'>
In order to normalize the data in these columns, we apply a log transform:
# Defining a function to compress heavy-tailed columns.
def handle_outlier(data, col):
    """Log-transform column *col* of *data* in place.

    Uses log1p (log(1 + x)) so zero values stay finite: log1p(0) == 0.
    """
    compressed = np.log1p(data[col])
    data[col] = compressed
# Apply the log transform to the two heavy-tailed columns.
handle_outlier(final_data,"adr")
handle_outlier(final_data,"lead_time")
# checking the results:
sns.distplot(final_data["adr"])
<AxesSubplot:xlabel='adr', ylabel='Density'>
# lead_time after the log transform.
sns.distplot(final_data["lead_time"])
<AxesSubplot:xlabel='lead_time', ylabel='Density'>
# Drop rows where the log transform produced NaN (log1p is undefined for
# values <= -1), then split the target (y) from the features (x).
final_data = final_data.dropna()
y = final_data["cancellation"]
x = final_data.drop(columns="cancellation")
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
# Embedded feature selection: features whose L1-regularized (Lasso)
# coefficient shrinks to zero get dropped.
feature_selection_model = SelectFromModel(Lasso(alpha = 0.005, random_state = 0))
# alpha is the penalty (hyper)parameter: the bigger the value, the fewer features are selected.
# Fitting the selector on the full feature matrix:
feature_selection_model.fit(x,y)
SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))
feature_selection_model.get_support()
# Boolean mask over x.columns: True = feature kept by the Lasso selector,
# False = coefficient shrank to zero and the feature is dropped.
array([False, False, False, False, False, False, True, False, True,
True, True, True, True, False, False, False, True, True,
False, False, True, False, True, False, True, True, True,
True])
# Let's find out which features are selected, by applying the boolean mask to x's columns:
selected_features = x.columns[feature_selection_model.get_support()]
# NOTE(review): res_year/res_month/res_day derive from reservation_status_date,
# which is only known at checkout/cancellation — a likely source of target leakage.
selected_features = selected_features.drop('res_year') #got higher scores when removed!!
x = x[selected_features]
x.head()
| deposit_type | res_month | res_day | lead_time | arrival_date_week_number | adults | children | previous_cancellations | booking_changes | company | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.28402 | 7 | 1 | 5.837730 | 27 | 2 | 0.0 | 0 | 3 | 0.0 | 0.000000 | 0 | 0 |
| 1 | 0.28402 | 7 | 1 | 6.603944 | 27 | 2 | 0.0 | 0 | 4 | 0.0 | 0.000000 | 0 | 0 |
| 2 | 0.28402 | 7 | 2 | 2.079442 | 27 | 1 | 0.0 | 0 | 0 | 0.0 | 4.330733 | 0 | 0 |
| 3 | 0.28402 | 7 | 2 | 2.639057 | 27 | 1 | 0.0 | 0 | 0 | 0.0 | 4.330733 | 0 | 0 |
| 4 | 0.28402 | 7 | 3 | 2.708050 | 27 | 2 | 0.0 | 0 | 0 | 0.0 | 4.595120 | 0 | 1 |
#preparing the datasets:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# 75/25 train/test split with a fixed seed for reproducibility:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.25, random_state = 1)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, each paired with a display name for the report loop.
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("RandomForest", RandomForestClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("KNN", KNeighborsClassifier()),
]
# Fit each candidate model and report its test-set confusion matrix and accuracy.
for name, model in models:
    print(">" * 10 + "\t" + name)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print("\nConfusion Matrix:")
    # BUG FIX: sklearn's convention is confusion_matrix(y_true, y_pred);
    # the original passed (predictions, y_test), which transposes the matrix
    # and mislabels false positives/negatives.
    print(confusion_matrix(y_test, predictions))
    print("\n")
    print("Accuracy Score:\t")
    # accuracy_score is symmetric, but keep the (y_true, y_pred) order for clarity
    print(accuracy_score(y_test, predictions))
    print("\n" + "-" * 60 + "\n")
>>>>>>>>>> Logistic Regression Confusion Matrix: [[17892 4997] [ 900 6014]] Accuracy Score: 0.8021340133543603 ------------------------------------------------------------ >>>>>>>>>> Naive Bayes Confusion Matrix: [[ 8426 1146] [10366 9865]] Accuracy Score: 0.6137301613931483 ------------------------------------------------------------ >>>>>>>>>> RandomForest Confusion Matrix: [[18653 1206] [ 139 9805]] Accuracy Score: 0.9548703150689528 ------------------------------------------------------------ >>>>>>>>>> Decision Tree Confusion Matrix: [[17960 799] [ 832 10212]] Accuracy Score: 0.9452739657081501 ------------------------------------------------------------ >>>>>>>>>> KNN Confusion Matrix: [[18571 1398] [ 221 9613]] Accuracy Score: 0.9456766097372747 ------------------------------------------------------------